[Deep Learning] Word Embedding (Korean)

FastText wiki pre-trained model (text vectors, wiki.ko.vec)

In [1]:
from __future__ import print_function
from gensim.models import KeyedVectors
In [2]:
# Load the pre-trained wiki word vectors
ko_model = KeyedVectors.load_word2vec_format('wiki.ko.vec')
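
The wiki.ko.vec file is, presumably, the Korean entry from fastText's pre-trained Wikipedia word vectors (https://fasttext.cc/docs/en/pretrained-vectors.html); loading the plain-text file takes a while and several GB of RAM. A minimal sanity check on the loaded model:

# Sanity check: vocabulary size and vector dimensionality
print(len(ko_model.vocab))    # number of word types in the model
print(ko_model.vector_size)   # 300 for the fastText wiki vectors
print(ko_model['사랑'][:5])    # first five components of one vector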
In [3]:
# Save the loaded vectors in gensim's native format for faster reloading
ko_model.save('ko_model')
In [4]:
# Precompute L2-normalized vectors in place; replace=True discards the
# originals to save memory (the model can no longer be trained afterwards)
ko_model.init_sims(replace=True)
In [5]:
# Collect the model vocabulary into a list
words = list(ko_model.vocab)
In [6]:
find_similar_to = '사랑'
In [7]:
# Print the 10 words most similar to '사랑' ("love")
for similar_word in ko_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))
Word: 사랑사랑, Similarity: 0.81
Word: 사랑치, Similarity: 0.78
Word: 사랑일, Similarity: 0.77
Word: 사랑느낌, Similarity: 0.76
Word: 사랑이었네, Similarity: 0.76
Word: 사랑이여, Similarity: 0.75
Word: 사랑병, Similarity: 0.75
Word: 사랑인, Similarity: 0.75
Word: 사랑맛, Similarity: 0.75
Word: 사랑노래, Similarity: 0.74
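
These scores are plain cosine similarities; after init_sims(replace=True) the stored vectors are unit-length, so a dot product reproduces them. A minimal check with numpy:

import numpy as np

# Reproduce the first score above by hand
v1, v2 = ko_model['사랑'], ko_model['사랑사랑']
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(round(float(cos), 2))  # should print 0.81, matching the output above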
In [8]:
word_add = ['동물', '파충류']
word_sub = ['뱀']
In [9]:
# Vector arithmetic: '동물' (animal) + '파충류' (reptile) - '뱀' (snake)
for resultant_word in ko_model.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))
Word : 포유류 , Similarity: 0.72
Word : 포유동물 , Similarity: 0.71
Word : 절지동물 , Similarity: 0.69
Word : 양서류 , Similarity: 0.69
Word : 독동물 , Similarity: 0.69
Word : 포유류분류 , Similarity: 0.68
Word : 무척추동물 , Similarity: 0.68
Word : 척추동물분류 , Similarity: 0.68
Word : 도시동물 , Similarity: 0.68
Word : 동물상 , Similarity: 0.67
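
Under the hood, most_similar implements the 3CosAdd analogy method of Mikolov et al.: positive vectors are combined with weight +1, negative ones with weight -1, the result is normalized, and the whole vocabulary is ranked by cosine similarity (with the query words excluded). A rough re-implementation for illustration, assuming init_sims(replace=True) has been called so all stored vectors are unit-length:

import numpy as np

# Query direction: 동물 + 파충류 - 뱀
query = ko_model['동물'] + ko_model['파충류'] - ko_model['뱀']
query /= np.linalg.norm(query)
sims = ko_model.vectors @ query        # dot product == cosine for unit vectors
top = np.argsort(-sims)[:13]           # a few extra slots to skip the query words
print([ko_model.index2word[i] for i in top])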
In [10]:
similarities = ko_model.most_similar(positive=['동물', '파충류'], negative=['뱀'])
In [11]:
print(similarities)
[('포유류', 0.7234190702438354), ('포유동물', 0.7082793712615967), ('절지동물', 0.6905428171157837), ('양서류', 0.6887608766555786), ('독동물', 0.6857677698135376), ('포유류분류', 0.6800143718719482), ('무척추동물', 0.6791884899139404), ('척추동물분류', 0.6789263486862183), ('도시동물', 0.6775411367416382), ('동물상', 0.6730656623840332)]
In [12]:
# Find the word that does not belong with the others
not_matching = ko_model.doesnt_match("아침 점심 저녁 된장국".split())
C:\Users\BowlMin\Anaconda3\envs\py36\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
In [13]:
print(not_matching)
된장국
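
doesnt_match averages the (normalized) vectors of the given words and returns the word farthest from that mean, so '된장국' (soybean-paste soup) is singled out from the three times of day. The same result by hand:

import numpy as np

words_in = "아침 점심 저녁 된장국".split()  # morning, lunch, evening, soybean-paste soup
vecs = np.vstack([ko_model[w] for w in words_in])  # unit vectors after init_sims
mean = vecs.mean(axis=0)
mean /= np.linalg.norm(mean)
print(words_in[int(np.argmin(vecs @ mean))])  # -> 된장국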
In [14]:
# Cosine similarity between '컴퓨터' (computer) and '인간' (human)
sim_score = ko_model.similarity('컴퓨터', '인간')
In [15]:
print(sim_score)
0.4248201
In [16]:
sim_score = ko_model.similarity('로봇', '인간')
In [17]:
print(sim_score)
0.4782262
In [18]:
sim_score = ko_model.similarity('사랑해', '사랑의')
In [19]:
print(sim_score)
0.5480147
In [20]:
# 10 most similar words to '전자' ("electron"/"electronics")
print(ko_model.most_similar('전자'))
[('전자빔', 0.7568391561508179), ('전자렌지', 0.7566049098968506), ('전자기기', 0.7503113746643066), ('전자양', 0.7480576634407043), ('가전자', 0.7460261583328247), ('전자악기', 0.7431635856628418), ('전자기기와', 0.7412865161895752), ('전자기계', 0.7339802980422974), ('전자만', 0.7331153154373169), ('전자적인', 0.7289555072784424)]
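
Note that the .vec file stores only whole-word vectors: the subword n-grams are not included, so an out-of-vocabulary lookup raises a KeyError. A quick check (the OOV token below is a made-up example and may behave differently if it happens to be in the vocabulary):

# Hypothetical OOV token, for illustration only
try:
    ko_model['사랑스러움들']
except KeyError:
    print('OOV: the .vec file has no subword information')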

FastText wiki pre-trained model (full binary model, wiki.ko.bin)

In [21]:
from gensim.models import FastText
In [22]:
ft2 = FastText.load_fasttext_format('wiki.ko.bin')
C:\Users\BowlMin\Anaconda3\envs\py36\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `load_fasttext_format` (use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model (to continue training with the loaded full model, more RAM) instead).
  """Entry point for launching an IPython kernel.
In [23]:
# Save the full fastText model under the name 'ft2'
ft2.save('ft2')
In [24]:
ft2.init_sims(replace=True)
In [25]:
find_similar_to = '사랑'
In [26]:
for similar_word in ft2.wv.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(
        similar_word[0], similar_word[1]
    ))
Word: 사랑사랑, Similarity: 0.81
Word: 사랑치, Similarity: 0.78
Word: 사랑일, Similarity: 0.77
Word: 사랑느낌, Similarity: 0.76
Word: 사랑이었네, Similarity: 0.76
Word: 사랑이여, Similarity: 0.75
Word: 사랑병, Similarity: 0.75
Word: 사랑인, Similarity: 0.75
Word: 사랑맛, Similarity: 0.75
Word: 사랑노래, Similarity: 0.74
In [27]:
word_add = ['동물', '파충류']
word_sub = ['뱀']
In [28]:
for resultant_word in ft2.wv.most_similar(
    positive=word_add, negative=word_sub
):
    print("Word : {0} , Similarity: {1:.2f}".format(
        resultant_word[0], resultant_word[1]
    ))
Word : 포유류 , Similarity: 0.72
Word : 포유동물 , Similarity: 0.71
Word : 절지동물 , Similarity: 0.69
Word : 양서류 , Similarity: 0.69
Word : 독동물 , Similarity: 0.69
Word : 포유류분류 , Similarity: 0.68
Word : 무척추동물 , Similarity: 0.68
Word : 척추동물분류 , Similarity: 0.68
Word : 도시동물 , Similarity: 0.68
Word : 동물상 , Similarity: 0.67
In [29]:
similarities = ft2.wv.most_similar(positive=['동물', '파충류'], negative=['뱀'])
In [30]:
print(similarities)
[('포유류', 0.7234194874763489), ('포유동물', 0.7082797884941101), ('절지동물', 0.6905421018600464), ('양서류', 0.6887608766555786), ('독동물', 0.6857682466506958), ('포유류분류', 0.6800158023834229), ('무척추동물', 0.6791882514953613), ('척추동물분류', 0.6789271831512451), ('도시동물', 0.6775408983230591), ('동물상', 0.6730659008026123)]
In [31]:
not_matching = ft2.wv.doesnt_match("아침 점심 저녁 된장국".split())
C:\Users\BowlMin\Anaconda3\envs\py36\lib\site-packages\gensim\models\keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
In [32]:
print(not_matching)
된장국
In [33]:
sim_score = ft2.wv.similarity('컴퓨터', '인간')
In [34]:
print(sim_score)
0.42482015
In [35]:
sim_score = ft2.wv.similarity('로봇', '인간')
In [36]:
print(sim_score)
0.47822645
In [37]:
sim_score = ft2.wv.similarity('사랑해', '사랑의')
In [38]:
print(sim_score)
0.54801536
In [39]:
print(ft2.wv.most_similar('전자'))
[('전자빔', 0.7568387985229492), ('전자렌지', 0.7566049695014954), ('전자기기', 0.750311017036438), ('전자양', 0.7480573654174805), ('가전자', 0.7460258603096008), ('전자악기', 0.743162989616394), ('전자기기와', 0.7412854433059692), ('전자기계', 0.7339791059494019), ('전자만', 0.7331157922744751), ('전자적인', 0.7289554476737976)]
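
The advantage of the full .bin model is that it keeps the character n-gram vectors, so it can compose an embedding even for words never seen in training (same made-up token as before, for illustration):

# OOV lookup works here, unlike with the .vec KeyedVectors
print('사랑스러움들' in ft2.wv.vocab)  # likely False
print(ft2.wv['사랑스러움들'][:5])      # a vector is still composed from n-grams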

Word2Vec wiki pre-trained model

In [40]:
import gensim
In [41]:
# Load a pre-trained Korean Word2Vec model
ko_w2v = gensim.models.Word2Vec.load('ko.bin')
In [42]:
ko_w2v.init_sims(replace=True)
In [43]:
similarities_wv = ko_w2v.wv.most_similar(positive=['동물', '파충류'], negative=['뱀'])
In [44]:
print(similarities_wv)
[('생물', 0.6952868700027466), ('영장류', 0.6766470670700073), ('조류', 0.6660945415496826), ('양서류', 0.6637342572212219), ('포유류', 0.659113347530365), ('설치류', 0.636635422706604), ('무척추', 0.6241835355758667), ('어류', 0.6236225366592407), ('절지', 0.6208628416061401), ('곤충', 0.6167744398117065)]
In [45]:
sim_score_wv = ko_w2v.wv.similarity('컴퓨터', '인간')
In [46]:
print(sim_score_wv)
0.21644185
In [47]:
sim_score_wv = ko_w2v.wv.similarity('로봇', '인간')
In [48]:
print(sim_score_wv)
0.40642476
In [49]:
print(ko_w2v.wv.most_similar(positive=["전자"], topn=10))
[('반도체', 0.6502741575241089), ('양전자', 0.6052197217941284), ('복사기', 0.5808517336845398), ('음전하', 0.5768587589263916), ('원자가', 0.5756815671920776), ('음극', 0.5747135281562805), ('양전하', 0.5658353567123413), ('절연체', 0.5621837377548218), ('상거래', 0.5594459772109985), ('광자', 0.5468275547027588)]
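
Plain Word2Vec also stores only whole-word vectors, which is why its neighbors are all dictionary words (compare fastText's subword-flavored neighbors like '전자기기와' above), and why OOV lookups fail:

# Same made-up OOV token as in the fastText sections
try:
    ko_w2v.wv['사랑스러움들']
except KeyError:
    print('OOV: Word2Vec has no subword fallback')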

GloVe model (trained here on a news-article corpus, not pre-trained)

In [50]:
from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix
In [51]:
# Korean news articles from 2016-10-20; iter_sent=True yields one
# double-space-delimited sentence at a time
corpus_path = '2016-10-20_article_all_normed.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
In [52]:
# Build a (word, context) co-occurrence matrix from the corpus
x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,                       # context window size
    min_tf=10,                       # drop words occurring fewer than 10 times
    tokenizer=lambda s: s.split(),   # default: whitespace tokenizer
    dynamic_weight=True,             # weight contexts by distance to the center word
    verbose=True)
print(x.shape)
Create (word, contexts) matrix
  - counting word frequency from 223356 sents, mem=3.380 Gb
  - scanning (word, context) pairs from 223356 sents, mem=3.899 Gb
  - (word, context) matrix was constructed. shape = (50091, 50091)                    
  - done
(50091, 50091)
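
x is a scipy sparse matrix whose rows and columns are both indexed by idx2vocab. One way to peek at the raw (distance-weighted, given dynamic_weight=True) co-occurrence values for a single word, a sketch assuming '동물' survived the min_tf=10 cutoff:

import numpy as np

vocab2idx = {w: i for i, w in enumerate(idx2vocab)}
row = x[vocab2idx['동물']].toarray().ravel()  # context weights for '동물'
top = np.argsort(-row)[:10]
print([(idx2vocab[i], row[i]) for i in top])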
In [53]:
from glove import Glove
In [54]:
# Create the GloVe model and train it on the co-occurrence matrix
glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
glove.fit(x.tocoo(), epochs=5, no_threads=4, verbose=True)
Performing 5 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
In [55]:
# Build the word -> index dictionary so the model can be queried by word
dictionary = {vocab:idx for idx, vocab in enumerate(idx2vocab)}
glove.add_dictionary(dictionary)
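
With the dictionary attached, raw GloVe vectors can also be pulled out directly:

vec = glove.word_vectors[glove.dictionary['사랑']]
print(vec.shape)  # (100,), i.e. no_components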
In [56]:
print(glove.most_similar('사랑', number=10))
[('길방', 0.9523622311165693), ('이문세의', 0.7709299671167765), ('일과', 0.7067012792878986), ('백선생2', 0.6770944696844872), ('월드지수', 0.6588736838372226), ('취미는', 0.6578404706277452), ('삼성화재가', 0.64967057830683), ('마스터', 0.6404152929079104), ('뽀꼬', 0.6362728223085667)]
In [57]:
print(glove.most_similar('동물', number=10))
[('애호', 0.8991982603293193), ('뉴욕에서', 0.8788155717188214), ('대사관', 0.8728831818746616), ('캐나다', 0.8563384980254631), ('아마존', 0.8560863828784364), ('태국', 0.8543265546091593), ('전역에서', 0.844629117471427), ('텍사스', 0.8327472727116574), ('등에서도', 0.814652839733725)]
In [58]:
print(glove.most_similar('로봇', number=10))
[('전문기자', 0.8650349594862273), ('기사입니다', 0.8502601533196349), ('풍선효과', 0.8335515748947783), ('명물로', 0.8163664744891702), ('증시분석', 0.807058672084537), ('등장했다고', 0.8057628957506581), ('지정하는', 0.8020943361293313), ('해안가', 0.794010006944613), ('씽크풀의', 0.785495111290533)]
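
With only a single day of news text and 5 training epochs, these neighbor lists are understandably much noisier than the wiki-scale fastText and Word2Vec results above.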

kor2vec model (cells left commented out; training not run here)

In [59]:
#from kor2vec import Kor2Vec
In [60]:
#kor2vec = Kor2Vec(embed_size=128)
In [61]:
#kor2vec.train("2016-10-20_article_all_normed.txt")